[1] 6
[1] 7
[1] 9
[1] 0 1 2 3 5 8
[1] 1 2 3 4 5 6 7 8 9 10 11
[1] 4 5 6 7 8 9 10 11 12 13 14
[1] 3
[1] 1 2 3 4 5 6 7 8 9 10
[1] 1 20 3 6
[1] 4 5 6 7 8 9 10 11 12 13
[1] 3 60 9 18
[1] "Emma" "Olivia" "Sophia"
[1] "2012-04-05" "2020-02-02"
[1] 12.0 62.3 7.0 101.0
[1] Female Female Female Male Male
Levels: Female Male
[1] TRUE FALSE FALSE
# Body weight in kilograms for six people
weight.kg <- c(60, 72, 57, 90, 95, 72)
# Height in meters for the same six people (parallel to weight.kg)
height.meters <- c(1.75, 1.80, 1.65, 1.90, 1.74, 1.91)
# Body Mass Index: weight (kg) / height (m)^2, computed elementwise
# (vectorized -- no loop needed)
bmi <- weight.kg/height.meters^2
bmi
[1] 19.59184 22.22222 20.93664 24.93075 31.37799 19.73630
[1] 68 69 71 72
kid byear
1 JHC 68
2 CMC 69
3 REC 70
4 WBC 71
[1] "kid" "byear"
'data.frame': 4 obs. of 2 variables:
$ kid : Factor w/ 4 levels "CMC","JHC","REC",..: 2 1 3 4
$ byear: num 68 69 70 71
[1] JHC CMC REC WBC
Levels: CMC JHC REC WBC
[1] 69.5
[1] "bmi" "Boys" "byear" "height.meters"
[5] "my.character" "my.dates" "my.factor" "my.logical"
[9] "my.numeric" "weight.kg" "x" "y"
[13] "z"
[1] "as_factor" "format_tagged_na" "is.labelled" "is_tagged_na"
[5] "labelled" "labelled_spss" "na_tag" "print_labels"
[9] "print_tagged_na" "read_dta" "read_por" "read_sas"
[13] "read_sav" "read_spss" "read_stata" "read_xpt"
[17] "tagged_na" "write_dta" "write_sas" "write_sav"
[21] "write_xpt" "zap_empty" "zap_formats" "zap_label"
[25] "zap_labels" "zap_missing" "zap_widths"
id female inc80 inc81 inc82
1 1 0 5000 5500 6000
2 2 1 2000 2200 3300
3 3 0 3000 2000 1000
'data.frame': 3 obs. of 5 variables:
$ id : int 1 2 3
$ female: int 0 1 0
$ inc80 : int 5000 2000 3000
$ inc81 : int 5500 2200 2000
$ inc82 : int 6000 3300 1000
# A tibble: 6 x 5
pclass survived age fare gender
<dbl+lbl> <dbl+lbl> <dbl> <dbl> <dbl+lbl>
1 1 [First] 1 [Yes] 29 211. 1 [Female]
2 1 [First] 1 [Yes] 0.917 152. 0 [Male]
3 1 [First] 0 [No] 2 152. 1 [Female]
4 1 [First] 0 [No] 30 152. 0 [Male]
5 1 [First] 0 [No] 25 152. 1 [Female]
6 1 [First] 1 [Yes] 48 26.6 0 [Male]
$label
[1] "Passenger Class"
$format.spss
[1] "F8.0"
$class
[1] "haven_labelled"
$labels
First Second Third
1 2 3
library(openxlsx)
# Recursively collect full paths of all .xlsx workbooks under the folder.
# Note: pattern is a regex, so ".xlsx$" anchors to the end of the name
# (the unescaped "." matches any character, which is harmless here).
my.files <- list.files("c:/chuck/NYU/Stat Group/R Workshop/Many Files/",
pattern = ".xlsx$", full.names=TRUE, recursive=TRUE)
my.files # vector of the file names with full path
[1] "c:/chuck/NYU/Stat Group/R Workshop/Many Files/File1.xlsx"
[2] "c:/chuck/NYU/Stat Group/R Workshop/Many Files/File2.xlsx"
[3] "c:/chuck/NYU/Stat Group/R Workshop/Many Files/File3.xlsx"
[4] "c:/chuck/NYU/Stat Group/R Workshop/Many Files/File4.xlsx"
[5] "c:/chuck/NYU/Stat Group/R Workshop/Many Files/File5.xlsx"
[6] "c:/chuck/NYU/Stat Group/R Workshop/Many Files/File6.xlsx"
[7] "c:/chuck/NYU/Stat Group/R Workshop/Many Files/File7.xlsx"
[8] "c:/chuck/NYU/Stat Group/R Workshop/Many Files/File8.xlsx"
# List of worksheet names within each MS-Excel workbook, parallel to
# my.files. getSheetNames() (openxlsx) takes the file path directly,
# so the anonymous wrapper function is unnecessary.
my.sheets <- lapply(my.files, getSheetNames)
unlist(my.sheets)
[1] "Sheet 1" "Sheet 1" "Sheet 1" "Sheet 1" "Sheet 1" "Sheet 1" "Sheet 1"
[8] "Sheet 1"
# One row per (workbook, worksheet) pair: each file path is repeated once
# for every sheet it contains. lengths() is the idiomatic base-R
# replacement for unlist(lapply(x, length)).
my.sheets.DF <- data.frame(file = rep(my.files, lengths(my.sheets)), sheet = unlist(my.sheets))
my.sheets.DF
file sheet
1 c:/chuck/NYU/Stat Group/R Workshop/Many Files/File1.xlsx Sheet 1
2 c:/chuck/NYU/Stat Group/R Workshop/Many Files/File2.xlsx Sheet 1
3 c:/chuck/NYU/Stat Group/R Workshop/Many Files/File3.xlsx Sheet 1
4 c:/chuck/NYU/Stat Group/R Workshop/Many Files/File4.xlsx Sheet 1
5 c:/chuck/NYU/Stat Group/R Workshop/Many Files/File5.xlsx Sheet 1
6 c:/chuck/NYU/Stat Group/R Workshop/Many Files/File6.xlsx Sheet 1
7 c:/chuck/NYU/Stat Group/R Workshop/Many Files/File7.xlsx Sheet 1
8 c:/chuck/NYU/Stat Group/R Workshop/Many Files/File8.xlsx Sheet 1
# Read every worksheet listed in my.sheets.DF into a list of data frames,
# one element per row. Preallocate the result: the original assigned into
# my.list without creating it first, which errors ("object 'my.list' not
# found") unless my.list already exists in the workspace.
my.list <- vector("list", nrow(my.sheets.DF))
for(i in seq_len(nrow(my.sheets.DF))){
  my.list[[i]] <- read.xlsx(xlsxFile = as.character(my.sheets.DF$file)[i],
                            sheet = as.character(my.sheets.DF$sheet)[i])
}
[1] 80 2
library(car)
# read_sav() is presumably haven::read_sav() (the haven exports appear
# earlier in this document) -- reads an SPSS .sav file into a tibble
auto <- read_sav("c:/chuck/NYU/Stat Group/R Workshop/auto.sav")
summary(auto$MPG)
# Bin continuous MPG into three ordered categories; "lo" and "hi" are
# car::recode() keywords for the variable's minimum and maximum
auto$MPG3 <- car::recode(auto$MPG, "lo:18=1; 19:23=2; 24:hi=3")
1 2 3
27 24 23
Two data frames with a common ID variable
# Two small family-level data frames sharing the key variable `famid`,
# used below to demonstrate merging.
A <- data.frame(
  famid = c(2, 1, 3),
  name  = c("Art", "Bill", "Paul"),
  inc   = c(22, 30, 25)
)
B <- data.frame(
  famid    = c(3, 1, 2),
  faminc96 = c(75, 40, 45),
  faminc97 = c(76, 40.5, 40.4),
  faminc98 = c(77, 41, 45.8)
)
intersect(names(A), names(B))
[1] "famid"
One Merged Data Frame
famid name inc
1 2 Art 22
2 1 Bill 30
3 3 Paul 25
famid faminc96 faminc97 faminc98
1 3 75 76.0 77.0
2 1 40 40.5 41.0
3 2 45 40.4 45.8
famid name inc faminc96 faminc97 faminc98
1 1 Bill 30 40 40.5 41.0
2 2 Art 22 45 40.4 45.8
3 3 Paul 25 75 76.0 77.0
# A tibble: 6 x 19
ID SALBEG SEX TIME AGE SALNOW EDLEVEL WORK JOBCAT MINORITY SEXRACE
<dbl> <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <fct>
1 628 8400 Males 81 28.5 16080 16 0.25 Colle… White White …
2 630 24000 Males 73 40.3 41400 16 12.5 Exemp… White White …
3 632 10200 Males 83 31.1 21960 15 4.08 Exemp… White White …
4 633 8700 Males 93 31.2 19200 16 1.83 Colle… White White …
5 635 17400 Males 83 41.9 28350 19 13 Exemp… White White …
6 637 12996 Males 80 29.5 27250 18 2.42 Colle… White White …
# … with 8 more variables: EGENDER <dbl>, EMINORIT <dbl>, EGENXMIN <dbl>,
# BSALCENT <dbl>, CSALADJ <dbl>, SALDELTA <dbl>, RES_1 <dbl>, GENDER <fct>
The head() function is a convenient way to see the first few observations of a data object
With the subset() function
# A tibble: 2 x 5
ID SEX SALBEG AGE SALNOW
<dbl> <fct> <dbl> <dbl> <dbl>
1 926 Males 3600 53.5 12300
2 1077 Males 3900 29.2 9000
# A tibble: 2 x 5
ID SEX SALBEG AGE SALNOW
<dbl> <fct> <dbl> <dbl> <dbl>
1 926 Males 3600 53.5 12300
2 1077 Males 3900 29.2 9000
id female inc80 inc81 inc82
1 1 0 5000 5500 6000
2 2 1 2000 2200 3300
3 3 0 3000 2000 1000
id female year inc
1 1 0 inc80 5000
2 2 1 inc80 2000
3 3 0 inc80 3000
4 1 0 inc81 5500
5 2 1 inc81 2200
6 3 0 inc81 2000
7 1 0 inc82 6000
8 2 1 inc82 3300
9 3 0 inc82 1000
id female year inc
1 1 0 inc80 5000
2 2 1 inc80 2000
3 3 0 inc80 3000
4 1 0 inc81 5500
5 2 1 inc81 2200
6 3 0 inc81 2000
7 1 0 inc82 6000
8 2 1 inc82 3300
9 3 0 inc82 1000
id female inc80 inc81 inc82
1 1 0 5000 5500 6000
2 2 1 2000 2200 3300
3 3 0 3000 2000 1000
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
Species Sepal.Length Sepal.Width Petal.Length Petal.Width
1 setosa 5.006 3.428 1.462 0.246
2 versicolor 5.936 2.770 4.260 1.326
3 virginica 6.588 2.974 5.552 2.026
[1] "B" "C"
[1] "F" "J"
[1] "A" "D"
[1] "A" "B" "C" "D" "F" "J"
[1] TRUE
[1] FALSE FALSE TRUE TRUE
[1] 3
[1] "A" "B" "c" "D"
[1] "Item-1" "Item-2" "Item-3" "Item-4" "Item-5" "Item-6" "Item-7"
[8] "Item-8" "Item-9" "Item-10"
# Character vector of category_item labels (two fruits, three instruments),
# used below to demonstrate pattern matching with grepl()
x <- c('fruit_apple','instrument_guitar','instrument_piano','fruit_orange','instrument_trumpet')
grepl("fruit", x)
[1] TRUE FALSE FALSE TRUE FALSE
library(dplyr)
filter() select() mutate() summarize() group_by() arrange()
Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
5424 2011 1 1 6 1400 1500 AA 428
5425 2011 1 2 7 1401 1501 AA 428
5426 2011 1 3 1 1352 1502 AA 428
5427 2011 1 4 2 1403 1513 AA 428
5428 2011 1 5 3 1405 1507 AA 428
5429 2011 1 6 4 1359 1503 AA 428
TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
5424 N576AA 60 40 -10 0 IAH DFW 224
5425 N557AA 60 45 -9 1 IAH DFW 224
5426 N541AA 70 48 -8 -8 IAH DFW 224
5427 N403AA 70 39 3 3 IAH DFW 224
5428 N492AA 62 44 -3 5 IAH DFW 224
5429 N262AA 64 45 -7 -1 IAH DFW 224
TaxiIn TaxiOut Cancelled CancellationCode Diverted
5424 7 13 0 0
5425 6 9 0 0
5426 5 17 0 0
5427 9 22 0 0
5428 9 9 0 0
5429 6 13 0 0
'data.frame': 227496 obs. of 21 variables:
$ Year : int 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
$ Month : int 1 1 1 1 1 1 1 1 1 1 ...
$ DayofMonth : int 1 2 3 4 5 6 7 8 9 10 ...
$ DayOfWeek : int 6 7 1 2 3 4 5 6 7 1 ...
$ DepTime : int 1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
$ ArrTime : int 1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
$ UniqueCarrier : chr "AA" "AA" "AA" "AA" ...
$ FlightNum : int 428 428 428 428 428 428 428 428 428 428 ...
$ TailNum : chr "N576AA" "N557AA" "N541AA" "N403AA" ...
$ ActualElapsedTime: int 60 60 70 70 62 64 70 59 71 70 ...
$ AirTime : int 40 45 48 39 44 45 43 40 41 45 ...
$ ArrDelay : int -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
$ DepDelay : int 0 1 -8 3 5 -1 -1 -5 43 43 ...
$ Origin : chr "IAH" "IAH" "IAH" "IAH" ...
$ Dest : chr "DFW" "DFW" "DFW" "DFW" ...
$ Distance : int 224 224 224 224 224 224 224 224 224 224 ...
$ TaxiIn : int 7 6 5 9 9 6 12 7 8 6 ...
$ TaxiOut : int 13 9 17 22 9 13 15 12 22 19 ...
$ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
$ CancellationCode : chr "" "" "" "" ...
$ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
1 2011 1 1 6 1400 1500 AA 428
2 2011 1 2 7 1401 1501 AA 428
3 2011 1 3 1 1352 1502 AA 428
4 2011 1 4 2 1403 1513 AA 428
5 2011 1 5 3 1405 1507 AA 428
6 2011 1 6 4 1359 1503 AA 428
TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
1 N576AA 60 40 -10 0 IAH DFW 224
2 N557AA 60 45 -9 1 IAH DFW 224
3 N541AA 70 48 -8 -8 IAH DFW 224
4 N403AA 70 39 3 3 IAH DFW 224
5 N492AA 62 44 -3 5 IAH DFW 224
6 N262AA 64 45 -7 -1 IAH DFW 224
TaxiIn TaxiOut Cancelled CancellationCode Diverted
1 7 13 0 0
2 6 9 0 0
3 5 17 0 0
4 9 22 0 0
5 9 9 0 0
6 6 13 0 0
DepTime ArrTime AirTime
5424 1400 1500 40
5425 1401 1501 45
5426 1352 1502 48
5427 1403 1513 39
5428 1405 1507 44
5429 1359 1503 45
DepTime ArrTime ActualElapsedTime AirTime
5424 1400 1500 60 40
5425 1401 1501 60 45
5426 1352 1502 70 48
5427 1403 1513 70 39
5428 1405 1507 62 44
5429 1359 1503 64 45
g1 <- mutate(hflights, ActualGroundTime = ActualElapsedTime - AirTime)
head(select(g1, ActualElapsedTime, AirTime, ActualGroundTime))
ActualElapsedTime AirTime ActualGroundTime
1 60 40 20
2 60 45 15
3 70 48 22
4 70 39 31
5 62 44 18
6 64 45 19
summarize(hflights, min_dist = min(Distance),
mean_dist = mean(Distance),
median_dist = median(Distance),
sd_dist = sd(Distance),
max_dist = max(Distance))
min_dist mean_dist median_dist sd_dist max_dist
1 79 787.7832 809 453.6806 3904
hflights %>% group_by(UniqueCarrier) %>%
summarize(min_dist = min(Distance),
mean_dist = mean(Distance),
median_dist = median(Distance),
sd_dist = sd(Distance),
max_dist = max(Distance))
# A tibble: 15 x 6
UniqueCarrier min_dist mean_dist median_dist sd_dist max_dist
<chr> <int> <dbl> <dbl> <dbl> <int>
1 AA 224 484. 224 353. 964
2 AS 1874 1874 1874 0 1874
3 B6 1428 1428 1428 0 1428
4 CO 140 1098. 1190 505. 3904
5 DL 689 723. 689 104. 1076
6 EV 468 776. 696 260. 1076
7 F9 666 883. 883 7.50 883
8 FL 490 685. 696 45.5 696
9 MQ 247 651. 247 448. 1379
10 OO 140 820. 809 300. 1428
11 UA 643 1178. 925 326. 1635
12 US 912 981. 913 110. 1325
13 WN 148 607. 453 399. 1642
14 XE 79 589. 562 281. 1201
15 YV 912 939. 913 79.6 1190
iris %>%
group_by(Species) %>%
summarize(Sepal.Length = mean(Sepal.Length),
Sepal.Width = mean(Sepal.Width),
Petal.Length = mean(Petal.Length),
Petal.Width = mean(Petal.Width))
# A tibble: 3 x 5
Species Sepal.Length Sepal.Width Petal.Length Petal.Width
<fct> <dbl> <dbl> <dbl> <dbl>
1 setosa 5.01 3.43 1.46 0.246
2 versicolor 5.94 2.77 4.26 1.33
3 virginica 6.59 2.97 5.55 2.03
hflights %>% group_by(UniqueCarrier) %>%
summarize(min_dist = min(Distance),
mean_dist = mean(Distance),
median_dist = median(Distance),
sd_dist = sd(Distance),
max_dist = max(Distance)) %>%
arrange(mean_dist)
# A tibble: 15 x 6
UniqueCarrier min_dist mean_dist median_dist sd_dist max_dist
<chr> <int> <dbl> <dbl> <dbl> <int>
1 AA 224 484. 224 353. 964
2 XE 79 589. 562 281. 1201
3 WN 148 607. 453 399. 1642
4 MQ 247 651. 247 448. 1379
5 FL 490 685. 696 45.5 696
6 DL 689 723. 689 104. 1076
7 EV 468 776. 696 260. 1076
8 OO 140 820. 809 300. 1428
9 F9 666 883. 883 7.50 883
10 YV 912 939. 913 79.6 1190
11 US 912 981. 913 110. 1325
12 CO 140 1098. 1190 505. 3904
13 UA 643 1178. 925 326. 1635
14 B6 1428 1428 1428 0 1428
15 AS 1874 1874 1874 0 1874
Without Piping
arrange(
summarize(
group_by(
filter(mtcars, carb > 1),
cyl),
Avg_mpg = mean(mpg)
),
desc(Avg_mpg)
)
# A tibble: 3 x 2
cyl Avg_mpg
<dbl> <dbl>
1 4 25.9
2 6 19.7
3 8 15.1
With Piping
mtcars %>%
filter(carb > 1) %>%
group_by(cyl) %>%
summarize(Avg_mpg = mean(mpg)) %>%
arrange(desc(Avg_mpg))
# A tibble: 3 x 2
cyl Avg_mpg
<dbl> <dbl>
1 4 25.9
2 6 19.7
3 8 15.1
hsb2 <- read_sav("https://clelandcm.github.io/Hands-On-R-Practice/hsb2.sav")
library(ggplot2)
ggplot(hsb2, aes(x = WRITE)) + geom_histogram(color = "grey", fill = "#34ebbd")
library(openxlsx)
fname <- read.xlsx("c:/chuck/NYU/Stat Group/R Workshop/Female-Names.xlsx", sheet = 1)
fname
Rank Name Ngirls
1 1 Emma 20799
2 2 Olivia 19674
3 3 Sophia 18490
4 4 Isabella 16950
5 5 Ava 15586
6 6 Mia 13442
7 7 Emily 12562
8 8 Abigail 11985
9 9 Madison 10247
10 10 Charlotte 10048
ggplot(fname, aes(x = forcats::fct_reorder(Name, Ngirls), y = Ngirls)) +
geom_bar(stat = 'identity') +
ylab('Number') + xlab('') +
ggtitle('Ten Most Common Female Baby\nNames in 2014') +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
ggplot2 package
Base R
library(ggplot2)
bank$GENDER <- as_factor(bank$GENDER)
ggplot(subset(bank, SALNOW < 30000), aes(x = SALBEG, y = SALNOW, color = GENDER)) +
geom_point()
library(ggplot2)
ggplot(subset(bank, SALNOW < 30000), aes(x = SALBEG, y = SALNOW)) +
geom_point() +
facet_wrap( ~ GENDER)
hsb <- read_dta('c:/chuck/NYU/Stat Group/R Workshop/hsb.dta')
hsb <- as_factor(hsb)
ggplot(hsb, aes(x = female, y = write)) + geom_boxplot() + xlab("") +
theme(axis.text = element_text(size = 4),
axis.title = element_text(size = 5))
library(igraph)
# Random scale-free network of 80 vertices (Barabasi-Albert preferential
# attachment model, from the igraph package)
net.bg <- barabasi.game(80)
# Randomly assign vertex outline and fill colors (50/50 split)
V(net.bg)$frame.color <- sample(c("white","black"), size = 80, replace=TRUE, prob=c(0.5,0.5))
V(net.bg)$color <- sample(c("orange","lightblue"), size = 80, replace=TRUE, prob=c(0.5,0.5))
# Suppress vertex labels and use a uniform vertex size
V(net.bg)$label <- ""
V(net.bg)$size <- 10
# Hide edge arrowheads (0 = no arrows)
E(net.bg)$arrow.mode <- 0
plot(net.bg, layout=layout.lgl)
library(ggplot2) # devtools::install_github("hadley/ggplot2")
library(ggalt) # devtools::install_github("hrbrmstr/ggalt")
library(dplyr) # for data_frame() & arrange()
# Share of adults using social networking sites, by country and age group
# (Pew Spring 2015 survey). NOTE(review): data_frame() is deprecated in
# current dplyr/tibble -- tibble() is the modern equivalent.
df <- data_frame(country=c("Germany", "France", "Vietnam", "Japan", "Poland", "Lebanon",
"Australia", "South\nKorea", "Canada", "Spain", "Italy", "Peru",
"U.S.", "UK", "Mexico", "Chile", "China", "India"),
ages_35=c(0.39, 0.42, 0.49, 0.43, 0.51, 0.57,
0.60, 0.45, 0.65, 0.57, 0.57, 0.65,
0.63, 0.59, 0.67, 0.75, 0.52, 0.48),
ages_18_to_34=c(0.81, 0.83, 0.86, 0.78, 0.86, 0.90,
0.91, 0.75, 0.93, 0.85, 0.83, 0.91,
0.89, 0.84, 0.90, 0.96, 0.73, 0.69),
# Percentage-point gap between the age groups, formatted as "+NN"
diff=sprintf("+%d", as.integer((ages_18_to_34-ages_35)*100)))
# Sort by the gap. Note diff is character, so this is a lexicographic
# sort; all gaps here happen to be two digits, so it matches numeric order.
df <- arrange(df, desc(diff))
# Freeze the sorted order as factor levels (reversed so the largest gap
# appears at the top of the plot's y axis)
df$country <- factor(df$country, levels=rev(df$country))
# Format a vector of proportions as percent labels, keeping the "%" sign
# only on the first element (e.g. c(0.39, 0.42) -> "39%", "42").
#
# x: numeric vector of proportions in [0, 1].
# Returns a character vector the same length as x.
percent_first <- function(x) {
  x <- sprintf("%d%%", round(x*100))
  # Strip the trailing "%" from every element after the first.
  # Guard the length-1 case: the original used x[2:length(x)], which for
  # length(x) == 1 is x[c(2, 1)] -- it stripped element 1 and appended NA.
  if (length(x) > 1) {
    x[-1] <- sub("%$", "", x[-1])
  }
  x
}
# Dumbbell-style chart of the social media age gap, built layer by layer.
gg <- ggplot()
# Light full-width reference line for each country row
gg <- gg + geom_segment(data=df, aes(y=country, yend=country, x=0, xend=1), color="#b2b2b2", size=0.15)
# Thicker segment spanning the two age-group values (the "dumbbell" bar)
gg <- gg + geom_segment(data=df, aes(y=country, yend=country, x=ages_35, xend=ages_18_to_34), color="#b2b2b2", size=2)
# End points: green = ages 35+, orange = ages 18-34
gg <- gg + geom_point(data=df, aes(x=ages_35, y = country), size=3, colour = "#9fb059")
gg <- gg + geom_point(data=df, aes(x=ages_18_to_34, y = country), size=3, colour = "#edae52")
# Legend-like labels drawn once, above the Germany (top) row only
gg <- gg + geom_text(data=dplyr::filter(df, country=="Germany"),
aes(x=ages_35, y=country, label="Ages 35+"),
color="#9fb059", size=3, vjust=-1, fontface="bold", family="Calibri")
gg <- gg + geom_text(data=dplyr::filter(df, country=="Germany"),
aes(x=ages_18_to_34, y=country, label="Ages 18-34"),
color="#edae52", size=3, vjust=-1, fontface="bold", family="Calibri")
# Percent labels beneath each point; percent_first() keeps "%" on the
# first (top) row only
gg <- gg + geom_text(data=df, aes(x=ages_35, y=country, label=percent_first(ages_35)),
color="#9fb059", size=2.75, vjust=1.75, family="Calibri")
gg <- gg + geom_text(data=df, color="#edae52", size=2.75, vjust=1.75, family="Calibri",
aes(x=ages_18_to_34, y=country, label=percent_first(ages_18_to_34)))
# Shaded right-hand column holding the DIFF values
gg <- gg + geom_rect(data=df, aes(xmin=1.05, xmax=1.175, ymin=-Inf, ymax=Inf), fill="#efefe3")
gg <- gg + geom_text(data=df, aes(label=diff, y=country, x=1.1125), fontface="bold", size=3, family="Calibri")
gg <- gg + geom_text(data=filter(df, country=="Germany"), aes(x=1.1125, y=country, label="DIFF"),
color="#7a7d7e", size=3.1, vjust=-2, fontface="bold", family="Calibri")
# x limit leaves room for the DIFF column; expand=c(0,0) removes padding
gg <- gg + scale_x_continuous(expand=c(0,0), limits=c(0, 1.175))
gg <- gg + scale_y_discrete(expand=c(0.075,0))
gg <- gg + labs(x=NULL, y=NULL, title="The social media age gap",
subtitle="Adult internet users or reported smartphone owners who\nuse social networking sites",
caption="Source: Pew Research Center, Spring 2015 Global Attitudes Survey. Q74")
# Strip default theme chrome for a clean, publication-style look
gg <- gg + theme_bw(base_family="Calibri")
gg <- gg + theme(panel.grid.major=element_blank())
gg <- gg + theme(panel.grid.minor=element_blank())
gg <- gg + theme(panel.border=element_blank())
gg <- gg + theme(axis.ticks=element_blank())
gg <- gg + theme(axis.text.x=element_blank())
gg <- gg + theme(plot.title=element_text(face="bold"))
gg <- gg + theme(plot.subtitle=element_text(face="italic", size=9, margin=margin(b=12)))
gg <- gg + theme(plot.caption=element_text(size=7, margin=margin(t=12), color="#7a7d7e"))
gg
id female race ses schtyp
Min. : 1.00 Min. :0.000 Min. :1.00 Min. :1.000 Min. :1.00
1st Qu.: 50.75 1st Qu.:0.000 1st Qu.:3.00 1st Qu.:2.000 1st Qu.:1.00
Median :100.50 Median :1.000 Median :4.00 Median :2.000 Median :1.00
Mean :100.50 Mean :0.545 Mean :3.43 Mean :2.055 Mean :1.16
3rd Qu.:150.25 3rd Qu.:1.000 3rd Qu.:4.00 3rd Qu.:3.000 3rd Qu.:1.00
Max. :200.00 Max. :1.000 Max. :4.00 Max. :3.000 Max. :2.00
prog read write math
Min. :1.000 Min. :28.00 Min. :31.00 Min. :33.00
1st Qu.:2.000 1st Qu.:44.00 1st Qu.:45.75 1st Qu.:45.00
Median :2.000 Median :50.00 Median :54.00 Median :52.00
Mean :2.025 Mean :52.23 Mean :52.77 Mean :52.65
3rd Qu.:2.250 3rd Qu.:60.00 3rd Qu.:60.00 3rd Qu.:59.00
Max. :3.000 Max. :76.00 Max. :67.00 Max. :75.00
science socst
Min. :26.00 Min. :26.00
1st Qu.:44.00 1st Qu.:46.00
Median :53.00 Median :52.00
Mean :51.85 Mean :52.41
3rd Qu.:58.00 3rd Qu.:61.00
Max. :74.00 Max. :71.00
Employment category
Frequency Percent
Clerical 227 47.890
Office trainee 136 28.692
Security officer 27 5.696
College trainee 41 8.650
Exempt employee 32 6.751
MBA trainee 5 1.055
Technical 6 1.266
Total 474 100.000
library(descr)
with(bank, CrossTable(SEX, JOBCAT %in% c("Clerical","Office trainee"),
prop.r=TRUE, prop.chi=FALSE, prop.t=FALSE, prop.c=FALSE, chisq=TRUE))
. Cell Contents
. |-------------------------|
. | Count |
. | Row Percent |
. |-------------------------|
.
. ========================================
. JOBCAT %in% c("Clerical", "Office trainee")
. Sex of employee FALSE TRUE Total
. ----------------------------------------
. Males 101 157 258
. 39.1% 60.9% 54.4%
. ----------------------------------------
. Females 10 206 216
. 4.6% 95.4% 45.6%
. ----------------------------------------
. Total 111 363 474
. ========================================
.
. Statistics for All Table Factors
.
. Pearson's Chi-squared test
. ------------------------------------------------------------
. Chi^2 = 78.10967 d.f. = 1 p <2e-16
.
. Pearson's Chi-squared test with Yates' continuity correction
. ------------------------------------------------------------
. Chi^2 = 76.19681 d.f. = 1 p <2e-16
. Minimum expected frequency: 50.58228
vars n mean sd median trimmed mad min max range skew kurtosis
read 1 200 52.23 10.25 50 52.03 10.38 28 76 48 0.19 -0.66
write 2 200 52.77 9.48 54 53.36 11.86 31 67 36 -0.47 -0.78
math 3 200 52.65 9.37 52 52.23 10.38 33 75 42 0.28 -0.69
science 4 200 51.85 9.90 53 52.02 11.86 26 74 48 -0.19 -0.60
socst 5 200 52.41 10.74 52 52.99 13.34 26 71 45 -0.38 -0.57
se
read 0.72
write 0.67
math 0.66
science 0.70
socst 0.76
One Sample t-test
data: write
t = 4.1403, df = 199, p-value = 5.121e-05
alternative hypothesis: true mean is not equal to 50
95 percent confidence interval:
51.45332 54.09668
sample estimates:
mean of x
52.775
Paired t-test
data: write and read
t = 0.86731, df = 199, p-value = 0.3868
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.6941424 1.7841424
sample estimates:
mean of the differences
0.545
Two Sample t-test
data: write by female
t = -3.7341, df = 198, p-value = 0.0002463
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-7.441835 -2.298059
sample estimates:
mean in group 0 mean in group 1
50.12088 54.99083
Welch Two Sample t-test
data: write by female
t = -3.6564, df = 169.71, p-value = 0.0003409
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-7.499159 -2.240734
sample estimates:
mean in group 0 mean in group 1
50.12088 54.99083
hsb <- read_sav("c:/chuck/NYU/Stat Group/R Workshop/hsb2.sav")
round(cor(subset(hsb, select=c(READ,WRITE,MATH,SCIENCE,SOCST))), 2)
READ WRITE MATH SCIENCE SOCST
READ 1.00 0.60 0.66 0.63 0.62
WRITE 0.60 1.00 0.62 0.57 0.60
MATH 0.66 0.62 1.00 0.63 0.54
SCIENCE 0.63 0.57 0.63 1.00 0.47
SOCST 0.62 0.60 0.54 0.47 1.00
Pearson's product-moment correlation
data: READ and SOCST
t = 11.163, df = 198, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.5282957 0.6998781
sample estimates:
cor
0.6214843
# A tibble: 6 x 11
ID FEMALE RACE SES SCHTYP PROG READ WRITE MATH SCIENCE SOCST
<dbl> <dbl+lb> <dbl+l> <dbl+l> <dbl+l> <dbl+l> <dbl> <dbl> <dbl> <dbl> <dbl>
1 70 0 [male] 4 [whi… 1 [low] 1 [pub… 1 [gen… 57 52 41 47 57
2 121 1 [fema… 4 [whi… 2 [mid… 1 [pub… 3 [voc… 68 59 53 63 61
3 86 0 [male] 4 [whi… 3 [hig… 1 [pub… 1 [gen… 44 33 54 58 31
4 141 0 [male] 4 [whi… 3 [hig… 1 [pub… 3 [voc… 63 44 47 53 56
5 172 0 [male] 4 [whi… 2 [mid… 1 [pub… 2 [aca… 47 52 57 53 61
6 113 0 [male] 4 [whi… 2 [mid… 1 [pub… 2 [aca… 44 52 51 63 61
Call:
lm(formula = SCIENCE ~ MATH + FEMALE + SOCST + READ, data = hsb)
Residuals:
Min 1Q Median 3Q Max
-18.6706 -4.5764 -0.3237 4.5006 21.8563
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 12.32529 3.19356 3.859 0.000154 ***
MATH 0.38931 0.07412 5.252 3.92e-07 ***
FEMALE -2.00976 1.02272 -1.965 0.050820 .
SOCST 0.04984 0.06223 0.801 0.424139
READ 0.33530 0.07278 4.607 7.36e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 7.148 on 195 degrees of freedom
Multiple R-squared: 0.4892, Adjusted R-squared: 0.4788
F-statistic: 46.69 on 4 and 195 DF, p-value: < 2.2e-16
2.5 % 97.5 %
(Intercept) 6.02694246 18.62363584
MATH 0.24312201 0.53549835
FEMALE -4.02677214 0.00724283
SOCST -0.07288986 0.17257843
READ 0.19176512 0.47883447
| Variable | Description | Codes/Values |
|---|---|---|
| LOW | Low Birth Weight | 1 = BWT<=2500g, 0 = BWT>2500g |
| AGE | Age of Mother | Years |
| LWT | Weight of Mother at Last Menstrual Period | Pounds |
| RACE | Race | 1=White, 2=Black, 3=Other |
| FTV | First Trimester Physician Visits | 0=None, 1=One, 2=Two,etc. |
# Low birth weight study data (SPSS file; read_sav is presumably
# haven::read_sav, loaded earlier in the document)
lowbwt <- read_sav('c:/chuck/NYU/Stat Group/R Workshop/lowbwt.sav')
# Map the numeric RACE codes (1/2/3) to labels, returning a factor
lowbwt$RACE <- car::recode(lowbwt$RACE, "1='White';2='Black';3='Other'", as.factor = TRUE)
# Make 'White' the reference level so model contrasts compare
# Black and Other against White
lowbwt$RACE <- relevel(lowbwt$RACE, ref = 'White')
Call:
glm(formula = LOW ~ AGE + LWT + RACE + FTV, family = "binomial",
data = lowbwt)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.4163 -0.8931 -0.7113 1.2454 2.0755
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.295366 1.071443 1.209 0.2267
AGE -0.023823 0.033730 -0.706 0.4800
LWT -0.014245 0.006541 -2.178 0.0294 *
RACEBlack 1.003898 0.497859 2.016 0.0438 *
RACEOther 0.433108 0.362240 1.196 0.2318
FTV -0.049308 0.167239 -0.295 0.7681
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 234.67 on 188 degrees of freedom
Residual deviance: 222.57 on 183 degrees of freedom
AIC: 234.57
Number of Fisher Scoring iterations: 4
AGE LWT RACEBlack RACEOther FTV
0.9764586 0.9858564 2.7288979 1.5420434 0.9518876
2.5 % 97.5 %
AGE 0.9124232 1.0421301
LWT 0.9725636 0.9979524
RACEBlack 1.0232217 7.3225967
RACEOther 0.7567464 3.1462285
FTV 0.6771508 1.3109248
library(scholar)
library(purrr)
# Google Scholar profile IDs (GID) for four researchers, used below to
# fetch citation metrics via scholar::get_profile()
DF <- data.frame(Person = c("Chuck Cleland","Sam Friedman","Holly Hagan","Dani Ompad"),
GID = c("AuiR_rAAAAAJ","bQwCIGQAAAAJ","67LUQJUAAAAJ","Zxdn7DwAAAAJ"))
do.call(rbind, map(map(DF$GID, get_profile),
function(x){data.frame(Person = x$name,
Total.Cites = x$total_cites,
h.index = x$h_index,
i10.index = x$i10_index)}))
Person Total.Cites h.index i10.index
1 Charles M. Cleland 6052 42 124
2 Samuel R Friedman 26259 85 349
3 Holly Hagan 40152 62 179
4 Danielle Ompad 5687 44 90
[1] 1 3 5 7 9
[1] 104 105 106 107 108 109 110 111 112 113 114
[1] "A" "B" "C" "A" "B" "C" "A" "B" "C"
[1] "A" "A" "A" "B" "B" "B" "C" "C" "C"
[1] 1.0 1.6 2.2 2.8 3.4 4.0
Person Visit
1 1 1
2 2 1
3 3 1
4 4 1
5 5 1
6 1 2
7 2 2
8 3 2
9 4 2
10 5 2
11 1 3
12 2 3
13 3 3
14 4 3
15 5 3
[1] 0.1608862 0.1912459 1.4058626 0.6708175 -1.0634520 0.4113199
[7] 2.4238979 -0.2514567 0.3252108 0.7818911
[1] 756.7219 363.0041 973.0582 810.7559 856.7343 522.0322 266.3716 772.0575
[9] 133.1941 477.6735
[1] 0 1 1 0 0 0 0 0 1 0
[1] "a" "a" "b" "a" "a" "a" "a" "b" "c" "a" "a" "a" "a" "a" "a" "b" "b" "a" "a"
[20] "a" "a" "a" "c" "a" "a" "b" "a" "a" "c" "b"
[1] 4 5 8 2 6 7 3 10 1 9
available.packages() function
install.packages() function
library() function
[1] "epi.2by2" "epi.about" "epi.asc" "epi.betabuster"
[5] "epi.bohning" "epi.ccc" "epi.ccsize" "epi.cluster1size"
[9] "epi.cluster2size" "epi.clustersize" "epi.cohortsize" "epi.conf"
[13] "epi.convgrid" "epi.cp" "epi.cpresids" "epi.descriptives"
[17] "epi.detectsize" "epi.dgamma" "epi.directadj" "epi.dms"
[21] "epi.dsl" "epi.edr" "epi.empbayes" "epi.equivb"
[25] "epi.equivc" "epi.herdtest" "epi.indirectadj" "epi.insthaz"
[29] "epi.interaction" "epi.iv"